package com.yzq.os.spider.v.service.spider.impl;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.yzq.os.spider.v.domain.Record;
import com.yzq.os.spider.v.service.spider.AbstractCrawlTask;
import com.yzq.os.spider.v.util.Regex;

/**
 * Demo crawl task that converts the job-listing HTML held in the inherited
 * {@code jobsPartHtmlSource} field into {@link Record} objects.
 * <p>
 * The listing is read column by column ({@code td.td1} .. {@code td.td4}) and
 * the columns are paired up by position, so the i-th title is combined with
 * the i-th company, city and date.
 */
public class DemoCrawlTask extends AbstractCrawlTask {

	/** Primary pattern for the job id, matching links such as {@code /job/123,c.html}. */
	private static final String JOB_ID_PATH_PATTERN = "/job/(\\d+),c.html";

	/** Fallback pattern for the job id, matching a {@code JobID=123} query parameter. */
	private static final String JOB_ID_QUERY_PATTERN = "JobID=(\\d+)";

	/** Pattern for the company id embedded in a company link. */
	private static final String COMPANY_ID_PATTERN = "http://www.xx.com/list/co,c,(\\d+),";

	/**
	 * Enables the "force use calculate turn page" flag on the base task before
	 * the crawl runs.
	 */
	@Override
	public void initializationBeforeRun() {
		setForceUseCalculateTurnPage(true);
	}

	/**
	 * Extracts one {@link Record} per listing row from {@code jobsPartHtmlSource}.
	 * <p>
	 * Only rows for which all four columns (title, company, city, date) were
	 * found are returned; previously a shorter column caused an
	 * {@link IndexOutOfBoundsException} that discarded every extracted row.
	 *
	 * @return the extracted records in document order; never {@code null},
	 *         empty when there is no HTML or no matching rows
	 */
	@Override
	public List<Record> extractJobs() {
		List<Record> infos = new ArrayList<Record>();
		if (jobsPartHtmlSource == null) {
			// Nothing was fetched; Jsoup.parse would reject a null input.
			return infos;
		}

		Document doc = Jsoup.parse(jobsPartHtmlSource);
		Elements jobTitles = doc.select("td.td1 > a");
		Elements companys = doc.select("td.td2 > a");
		Elements citys = doc.select("td.td3 > span");
		Elements dates = doc.select("td.td4 > span");

		// The columns are selected independently, so they may differ in length
		// when a row lacks one of the cells. Stop at the shortest column rather
		// than indexing past its end.
		// NOTE(review): positional pairing assumes every row has all four cells;
		// a row missing a cell in the middle would shift later rows - confirm
		// against the target page markup.
		int rowCount = Math.min(Math.min(jobTitles.size(), companys.size()),
				Math.min(citys.size(), dates.size()));

		for (int i = 0; i < rowCount; i++) {
			Element jobHyperLink = jobTitles.get(i);
			Element companyHyperLink = companys.get(i);

			Record job = new Record();
			job.setSearchEngineId(searchEngine.getId());
			job.setQueryUrlId(queryURL.getId());
			job.setCityText(citys.get(i).text());
			job.setCompanyName(companyHyperLink.text());
			job.setJobDate(parseJobDate(dates.get(i).text()));
			job.setJobTitle(jobHyperLink.text());
			job.setJobTypeCode(extractJobTypeCode());
			job.setIndustryCode(extractIndustryCode());
			job.setCityCode(extractCityCode());

			job.setJobLinkURL(uniformURLFormat(jobHyperLink.attr("href")));
			job.setCmptrJobId(extractCmptrJobId(job.getJobLinkURL()));

			job.setCompanyLinkURL(uniformURLFormat(companyHyperLink.attr("href")));
			// The company id is only looked up when a company link is present.
			if (StringUtils.isNotBlank(job.getCompanyLinkURL())) {
				job.setCmpCompanyId(Regex.matchSRowSField(job.getCompanyLinkURL(), COMPANY_ID_PATTERN, false));
			}

			infos.add(job);
		}
		return infos;
	}

	/**
	 * Reads the job id out of a job link, trying the path form first and the
	 * {@code JobID=} query form when the first attempt yields a blank result.
	 *
	 * @param jobLinkURL the job link to inspect
	 * @return whatever the last attempted pattern lookup returned
	 */
	private String extractCmptrJobId(String jobLinkURL) {
		String cmptrJobId = Regex.matchSRowSField(jobLinkURL, JOB_ID_PATH_PATTERN, false);
		if (StringUtils.isBlank(cmptrJobId)) {
			cmptrJobId = Regex.matchSRowSField(jobLinkURL, JOB_ID_QUERY_PATTERN, false);
		}
		return cmptrJobId;
	}

}
